Loading the required packages
library(tidyverse)
library(dplyr)
library(ggplot2)
library(rtweet)
library(readr)
library(DataExplorer)
library(ggcute)
Import processed data, which can be found here.
# Load the preprocessed wine reviews from the processed-data folder.
wines <- read.csv("../data/processed_data/wines.csv")
Get sample of dataset
# Fix the random seed (a birthday chosen by the authors) so that the sample
# drawn below is reproducible.
set.seed(19630217)
# Share of the data set, in percent, to keep in the sample.
percentage <- 5
# Randomly pick that share of the rows of wines.
wine_sample <- wines %>%
  sample_n(percentage / 100 * nrow(wines))
# Lookup table with one row per distinct (taster_name, taster_twitter_handle)
# combination found in wines.
tasters <- unique(select(wines, taster_name, taster_twitter_handle))
tasters
Drop taster_twitter_handle in wines dataframe
# The handle is kept in tasters, so remove the column from the review table.
wines <- select(wines, -taster_twitter_handle)
head(wines)
Each reviewer has their own bias. To offset that, we built a “profile” for each reviewer that includes characteristics such as avg_points, sd_points, and var_points.
# Scoring profile per taster: mean, standard deviation and variance of the
# points they awarded, plus the number of reviews they wrote.
taster_rating_profile <- summarize(
  group_by(wines, taster_name),
  avg_points = mean(points),
  sd_points = sd(points),
  var_points = var(points),
  reviews = n()
)
# Attach each profile to the taster lookup table, matching on taster_name.
tasters <- tasters %>%
  inner_join(taster_rating_profile, by = "taster_name")
head(tasters)
Add following classification to wine dataset as found on the website:
| Category | Rating | Description |
|---|---|---|
| Classic | 98-100 | The pinnacle of quality. |
| Superb | 94-97 | A great achievement. |
| Excellent | 90-93 | Highly recommended. |
| Very Good | 87-89 | Often good value; well recommended. |
| Good | 83-86 | Suitable for everyday consumption; often good value. |
| Acceptable | 80-82 | Can be employed in casual, less-critical circumstances |
#' Map review points to a rating category
#'
#' Vectorized: accepts a whole column of scores at once, so it can be used
#' directly inside `mutate()` without `rowwise()`. Scalar calls return the
#' same labels as before.
#'
#' @param points Numeric vector of review scores.
#' @return Character vector the same length as `points`: "Classic" (>= 98),
#'   "Superb" (>= 94), "Excellent" (>= 90), "Very Good" (>= 87),
#'   "Good" (>= 83), otherwise "Acceptable". `NA` scores give `NA`.
rating_category <- function(points) {
  # Lower bound of every category above "Acceptable", in increasing order.
  lower_bounds <- c(83, 87, 90, 94, 98)
  labels <- c(
    "Acceptable", "Good", "Very Good", "Excellent", "Superb", "Classic"
  )
  # findInterval() counts how many bounds are <= each score (0 to 5), which
  # is exactly the zero-based position of the score's category.
  labels[findInterval(points, lower_bounds) + 1L]
}
# Label every wine with its rating category.
# vapply() calls rating_category() once per score, so no rowwise() is needed
# and wines is not left as a rowwise data frame for the chunks that follow.
# The wrapper function keeps the lookup of rating_category() unambiguous even
# if a column of the same name already exists (e.g. when re-running the chunk).
wines <- wines %>%
  mutate(
    rating_category = vapply(
      points,
      function(p) rating_category(p),
      character(1)
    )
  )
head(wines)
Since each reviewer has a different bias, we created a normalized metric, norm_points, which is the number of standard deviations a wine's score lies from the reviewer’s avg_points. This gives us a more accurate picture of which wines are better than the rest.
#' Add a taster-normalized score to a table of reviews
#'
#' Joins each review to its taster's scoring profile and computes
#' `norm_points`, the number of standard deviations the review's `points`
#' lies from that taster's average.
#'
#' @param data Data frame of reviews with `taster_name` and `points` columns.
#' @param profiles Data frame with columns `taster_name`, `avg_points`,
#'   `sd_points`, `var_points`, `taster_twitter_handle` and `reviews`.
#'   Defaults to the `tasters` table built earlier in this document, which is
#'   what the function previously read implicitly.
#' @return `data` with a `norm_points` column added and the joined profile
#'   columns removed. The result is a plain (ungrouped) data frame.
normalize_points <- function(data, profiles = tasters) {
  data %>%
    # The arithmetic below is vectorized, so rowwise() is not needed; any
    # rowwise state is dropped so it cannot leak into later group_by(),
    # slice() and similar calls.
    ungroup() %>%
    left_join(profiles, by = "taster_name") %>%
    mutate(norm_points = (points - avg_points) / sd_points) %>%
    select(
      -avg_points, -sd_points, -var_points,
      -taster_twitter_handle, -reviews
    )
}
wines <- normalize_points(wines)
head(wines)
The vintage column contains implausible years (for example 7200), so we kept only wines with a vintage before 2019.
# Keep only wines with a vintage earlier than 2019. filter() also drops rows
# where the comparison is NA, i.e. wines without a vintage.
wines <- filter(wines, vintage < 2019)
TODO: EXPLAIN GRAPH AND WHAT WE ARE DOING HERE (OSAKI)
# Histogram of alcohol content, restricted to the 4-22 % range.
# No group_by() is needed: ggplot2 bins the raw alcohol column itself.
# No fill scale is added because no fill aesthetic is mapped.
wines %>%
  ggplot(mapping = aes(x = alcohol)) +
  # na.rm = TRUE silences the warning for rows that are missing or fall
  # outside the axis limits set below.
  geom_histogram(bins = 50, na.rm = TRUE) +
  scale_x_continuous(
    name = "Alcohol Percentage",
    breaks = seq(0, 25, 1),
    limits = c(4, 22)
  ) +
  theme_fairyfloss() +
  theme(plot.background = element_rect("white"))
Grouping rowwise data frame strips rowwise nature
Understanding which vintages the reviewed wines came from.
# Number of reviewed wines for each vintage year.
summarise(group_by(wines, vintage), count = n())
Grouping rowwise data frame strips rowwise nature
TODO: EXPLAIN GRAPHS (IZZY) (Note: data points before 1990 have been omitted for clarity in the visualization.)
# Bar chart of the number of reviewed wines per vintage, from 1990 onwards.
# Older vintages are removed by filtering the data rather than by hard axis
# limits: a bar for 1990 extends slightly below 1990, so scale limits of
# c(1990, 2019) would censor that bar. geom_bar() counts rows per vintage
# itself, so no group_by() is needed.
wines %>%
  filter(vintage >= 1990) %>%
  ggplot() +
  geom_bar(mapping = aes(x = vintage)) +
  scale_x_continuous(breaks = seq(1990, 2019, 5)) +
  labs(x = "Vintage", y = "Count") +
  theme_fairyfloss()
Grouping rowwise data frame strips rowwise nature
To better understand the number of wines per winery, we made a univariate visualization that counts the wines from each winery, showing only the top 15 wineries to give you an idea of which wineries have the largest selection of wines.
# The 15 wineries with the most reviewed wines, drawn as horizontal bars
# ordered by count.
wines %>%
  group_by(winery) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  slice(1:15) %>%
  ggplot(mapping = aes(x = count, y = reorder(winery, count))) +
  geom_col()
Grouping rowwise data frame strips rowwise nature
To better understand the number of wines per province, we made a univariate visualization that counts the wines from each province, showing only the top 10 provinces with the most wines. This gives the reader an idea of where their wine is most likely to be made, with California standing out as the clear leader.
# The 10 provinces with the most reviewed wines, drawn as horizontal bars
# ordered by count.
wines %>%
  group_by(province) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  slice(1:10) %>%
  ggplot(mapping = aes(x = count, y = reorder(province, count))) +
  geom_col()
Grouping rowwise data frame strips rowwise nature
Calculating the Mean, Standard Deviation, Minimum, and Max Price for the entire wine dataset and printing the values.
# Summary statistics of the price column over all wines; missing prices are
# ignored (na.rm = TRUE).
mean_price <- mean(wines$price, na.rm = TRUE)
sd_price <- sd(wines$price, na.rm = TRUE)
min_price <- min(wines$price, na.rm = TRUE)
max_price <- max(wines$price, na.rm = TRUE)
# Print each statistic with a label; the `[1] "..."` lines are the echoed
# console output of the preceding print() call.
print(paste("Mean Price:", mean_price))
[1] "Mean Price: 35.4748507788616"
print(paste("SD Price:", sd_price))
[1] "SD Price: 41.238007633635"
print(paste("Min Price:", min_price))
[1] "Min Price: 4"
print(paste("Max Price:", max_price))
[1] "Max Price: 3300"
TODO: Vamsi
# Histogram of wine prices below 1000.
# The wines are passed to geom_histogram() unaggregated so that every wine is
# counted. Summarising to one row per distinct price first would make the
# histogram count distinct price values instead of wines.
wines %>%
  filter(price < 1000) %>%
  ggplot() +
  # bins = 30 is the geom's default, stated explicitly to silence the
  # "pick a better value" message.
  geom_histogram(mapping = aes(x = price), bins = 30)
Grouping rowwise data frame strips rowwise nature
Calculating the Mean, Standard Deviation, Minimum, and Max Points for the entire wine dataset and printing the values.
# Summary statistics of the points column over all wines; missing scores are
# ignored (na.rm = TRUE).
mean_points <- mean(wines$points, na.rm = TRUE)
sd_points <- sd(wines$points, na.rm = TRUE)
min_points <- min(wines$points, na.rm = TRUE)
max_points <- max(wines$points, na.rm = TRUE)
# Print each statistic with a label; the `[1] "..."` lines are the echoed
# console output of the preceding print() call.
print(paste("Mean Points:", mean_points))
[1] "Mean Points: 88.4744820916541"
print(paste("SD Points:", sd_points))
[1] "SD Points: 3.05417480898736"
print(paste("Min Points:", min_points))
[1] "Min Points: 80"
print(paste("Max Points:", max_points))
[1] "Max Points: 100"
TODO: EXPLAIN GRPAH OSAKI
# Distribution of review points.
# Scores run from 80 to 100 (see the summary printed above). One bin per
# point keeps every bar aligned with a single score; 20 bins spread over
# that range are slightly wider than one point and do not line up with the
# whole-number scores.
# NOTE(review): assumes points are whole numbers - confirm against the data.
wines %>%
  ggplot() +
  geom_histogram(mapping = aes(x = points), binwidth = 1)
To help you understand how points are distributed for each reviewer, we made a multivariate visualization: one box plot of points per taster, with a vertical line marking the average score across all wines. This gives the reader an idea of how each reviewer's ratings compare to the overall average.
# Box plot of points for each taster, with a vertical reference line at the
# mean of all points in wines.
ggplot(wines) +
  geom_boxplot(mapping = aes(x = points, y = taster_name)) +
  geom_vline(xintercept = mean(wines$points))
TODO EXPLAIN VAMSI Notice that the data is “stacked” and the scores range from 80 to 100.
# Scatter plot of price against points; the low alpha makes overplotted
# regions visible as darker areas.
ggplot(wines) +
  geom_point(
    mapping = aes(x = points, y = price),
    alpha = 0.15,
    na.rm = TRUE
  ) +
  labs(title = "Price by Points", x = "Points", y = "Price")
TODO EXPLAIN VAMSI
# Same scatter plot with price on a natural-log scale.
ggplot(wines) +
  geom_point(
    mapping = aes(x = points, y = log(price)),
    alpha = 0.15,
    na.rm = TRUE
  ) +
  labs(title = "log(Price) by Points", x = "Points", y = "log(Price)")
TODO EXPLAIN IZZY This
# Price against points for wines under 1000, one panel per category.
# No group_by() is needed: neither filter(price < 1000) nor the plot uses
# groups. The legend is hidden because each panel title already names the
# category.
wines %>%
  filter(price < 1000) %>%
  ggplot() +
  geom_point(
    mapping = aes(x = points, y = price, color = category),
    na.rm = TRUE
  ) +
  facet_wrap(~ category) +
  scale_color_fairyfloss() +
  theme_minimal() +
  theme(legend.position = "none")
Grouping rowwise data frame strips rowwise nature
TODO EXPLAIN OSAKI To determine the best province for wine by points we average all the wines per province and return the top 10 with standard error.
# Top 10 provinces by average points, with standard-error bars.
# Only provinces with more than 30 reviewed wines are considered.
wines %>%
  group_by(province) %>%
  summarise(
    avg_points_prov = mean(points),
    count = n(),
    # Standard error of the mean: sd / sqrt(n).
    std_points_prov_err = sd(points) / sqrt(count)
  ) %>%
  filter(count > 30) %>%
  arrange(desc(avg_points_prov)) %>%
  slice(1:10) %>%
  # Shared aesthetics are declared once; reorder() sorts the bars by average
  # score instead of alphabetically, matching the count charts above.
  ggplot(
    mapping = aes(
      x = avg_points_prov,
      y = reorder(province, avg_points_prov)
    )
  ) +
  geom_col() +
  geom_errorbar(
    mapping = aes(
      xmin = avg_points_prov - std_points_prov_err,
      xmax = avg_points_prov + std_points_prov_err
    ),
    width = 0.2
  ) +
  labs(x = "Average points", y = "Province")
Grouping rowwise data frame strips rowwise nature
To determine the best variety of wine, we use the average points of all wines per variety, considering only varieties with a sample size greater than 30. The graph below shows the top 10 varieties with their respective standard errors.
# Top 10 varieties by average points, with standard-error bars.
# Only varieties with more than 30 reviewed wines are considered.
wines %>%
  group_by(variety) %>%
  summarise(
    avg_points_variety = mean(points),
    count = n(),
    # Standard error of the mean: sd / sqrt(n).
    sd_err_points_variety = sd(points) / sqrt(count)
  ) %>%
  filter(count > 30) %>%
  arrange(desc(avg_points_variety)) %>%
  slice(1:10) %>%
  # Shared aesthetics are declared once; reorder() sorts the bars by average
  # score instead of alphabetically, matching the count charts above.
  ggplot(
    mapping = aes(
      x = avg_points_variety,
      y = reorder(variety, avg_points_variety)
    )
  ) +
  geom_col() +
  geom_errorbar(
    mapping = aes(
      xmin = avg_points_variety - sd_err_points_variety,
      xmax = avg_points_variety + sd_err_points_variety
    ),
    width = 0.2
  ) +
  labs(x = "Average points", y = "Variety")
Grouping rowwise data frame strips rowwise nature
TODO IZZY
# Ask for a budget and list the wines at or below it, best-scored first.
user_price <- readline(prompt = "How much are you willing to spend on a bottle?")
# Parse as a number rather than an integer so that a budget such as "12.50"
# is not truncated to 12. Coercion warnings are replaced by the explicit
# warning below.
user_price <- suppressWarnings(as.numeric(trimws(user_price)))
if (is.na(user_price)) {
  # readline() returns "" when the session is not interactive (e.g. while
  # knitting); without this warning the empty table below would appear with
  # no explanation.
  warning(
    "No valid price was entered, so no wines can be matched.",
    call. = FALSE
  )
}
wines %>%
  filter(price <= user_price) %>%
  arrange(desc(points)) %>%
  select(title, price, points)
An easy way to determine the best wines is simply to take the 10 wines with the highest points.
# The 10 wines with the highest raw points.
# ungroup() comes first because wines may still be a rowwise data frame at
# this point; slice() then works within each one-row group and would return
# every row instead of the first 10. On an ungrouped data frame it is a no-op.
wines %>%
  ungroup() %>%
  arrange(desc(points)) %>%
  slice(1:10)
However, this does not account for each grader's bias. Instead, our group “normalized” the points for each taster by measuring how many standard deviations a wine's score is from that taster's average. For example, Taster A could give a wine 100 but have an average rating of 95 with a standard deviation of 5, whereas Taster B could give a wine 91 and have an average rating of 87 with a standard deviation of 2. Although the wine tasted by Taster A received a perfect score of 100, Taster B’s wine was the “better” wine, since it was 2 standard deviations above its taster's average compared to 1 standard deviation for the other wine.
Looking at the norm_points these are the top 10 best wines
# The 10 wines with the highest taster-normalized points.
# ungroup() comes first because wines may still be a rowwise data frame at
# this point; slice() then works within each one-row group and would return
# every row instead of the first 10. On an ungrouped data frame it is a no-op.
wines %>%
  ungroup() %>%
  arrange(desc(norm_points)) %>%
  slice(1:10)